Data Inspection¶

In [1]:
import plotly
plotly.offline.init_notebook_mode()
In [2]:
import pandas as pd
from tqdm import tqdm

df_list = list()
chunk_iter = pd.read_csv(
    "../data/Total_Data_10Y_Top24.csv", 
    chunksize=100000, 
    dtype = {
        "CANCELLATION_CODE": str,
    }
)
for chunk in tqdm(chunk_iter):
    df_list.append(chunk)
df = pd.concat(df_list)
df.head()
130it [00:30,  4.23it/s]
Out[2]:
FL_DATE OP_UNIQUE_CARRIER TAIL_NUM ORIGIN_AIRPORT_SEQ_ID ORIGIN_CITY_MARKET_ID ORIGIN ORIGIN_CITY_NAME DEST_AIRPORT_SEQ_ID DEST_CITY_MARKET_ID DEST ... TAXI_IN ARR_TIME ARR_DELAY CANCELLED CANCELLATION_CODE CARRIER_DELAY WEATHER_DELAY NAS_DELAY SECURITY_DELAY LATE_AIRCRAFT_DELAY
0 2014-07-01 AA N002AA 1105703 31057 CLT Charlotte, NC 1129803 30194 DFW ... 28.0 1214.0 9.0 0.0 NaN NaN NaN NaN NaN NaN
1 2014-07-01 AA N002AA 1129803 30194 DFW Dallas/Fort Worth, TX 1105703 31057 CLT ... 13.0 945.0 0.0 0.0 NaN NaN NaN NaN NaN NaN
2 2014-07-01 AA N004AA 1039705 30397 ATL Atlanta, GA 1129803 30194 DFW ... 6.0 1341.0 -9.0 0.0 NaN NaN NaN NaN NaN NaN
3 2014-07-01 AA N004AA 1129803 30194 DFW Dallas/Fort Worth, TX 1039705 30397 ATL ... 7.0 1159.0 4.0 0.0 NaN NaN NaN NaN NaN NaN
4 2014-07-01 AA N004AA 1129803 30194 DFW Dallas/Fort Worth, TX 1039705 30397 ATL ... 6.0 2317.0 2.0 0.0 NaN NaN NaN NaN NaN NaN

5 rows × 24 columns

Data Cleaning¶

In [3]:
df.columns
Out[3]:
Index(['FL_DATE', 'OP_UNIQUE_CARRIER', 'TAIL_NUM', 'ORIGIN_AIRPORT_SEQ_ID',
       'ORIGIN_CITY_MARKET_ID', 'ORIGIN', 'ORIGIN_CITY_NAME',
       'DEST_AIRPORT_SEQ_ID', 'DEST_CITY_MARKET_ID', 'DEST', 'DEST_CITY_NAME',
       'DEP_TIME', 'DEP_DELAY', 'TAXI_OUT', 'TAXI_IN', 'ARR_TIME', 'ARR_DELAY',
       'CANCELLED', 'CANCELLATION_CODE', 'CARRIER_DELAY', 'WEATHER_DELAY',
       'NAS_DELAY', 'SECURITY_DELAY', 'LATE_AIRCRAFT_DELAY'],
      dtype='object')
In [4]:
import matplotlib.pyplot as plt
from datetime import time

target_df = df.drop(columns = [
    "TAIL_NUM",
    "ORIGIN_AIRPORT_SEQ_ID",
    "ORIGIN_CITY_MARKET_ID",
    "ORIGIN",
    "ORIGIN_CITY_NAME",
    "DEST_AIRPORT_SEQ_ID",
    "DEST_CITY_MARKET_ID",
    "DEST",
    "DEST_CITY_NAME",
    "DEP_TIME",
    "TAXI_OUT",
    "TAXI_IN",
    "ARR_TIME",
])
del df
In [5]:
target_df['FL_DATE'] = pd.to_datetime(target_df['FL_DATE'])
In [6]:
airline_names = pd.read_csv("../supplementary/top_18_airline_review_link.csv")
airline_names
Out[6]:
index OP_UNIQUE_CARRIER count CARRIER CARRIERNAME matched_name urls
0 15 WN 63426 WN Southwest Airlines Co. Southwest Airlines https://www.airlinequality.com/airline-reviews...
1 4 DL 43959 DL Delta Air Lines Inc. Delta Air Lines https://www.airlinequality.com/airline-reviews...
2 1 AA 41637 AA American Airlines Inc. American Airlines https://www.airlinequality.com/airline-reviews...
3 12 OO 34846 OO SkyWest Airlines Inc. SkyWest Airlines https://www.airlinequality.com/airline-reviews...
4 14 UA 28030 UA United Air Lines Inc. United Airlines https://www.airlinequality.com/airline-reviews...
5 3 B6 12805 B6 JetBlue Airways Jetblue Airways https://www.airlinequality.com/airline-reviews...
6 2 AS 9965 AS Alaska Airlines Inc. Alaska Airlines https://www.airlinequality.com/airline-reviews...
7 10 NK 8757 NK Spirit Air Lines Spirit Airlines https://www.airlinequality.com/airline-reviews...
8 6 F9 6270 F9 Frontier Airlines Inc. Frontier Airlines https://www.airlinequality.com/airline-reviews...
9 8 HA 3679 HA Hawaiian Airlines Inc. Hawaiian Airlines https://www.airlinequality.com/airline-reviews...
10 7 G4 3477 G4 Allegiant Air Allegiant Air https://www.airlinequality.com/airline-reviews...
11 13 QX 1000 QX Horizon Air Horizon Air https://www.airlinequality.com/airline-reviews...
In [7]:
airport_set_df = target_df.OP_UNIQUE_CARRIER.drop_duplicates()
airport_review = pd.read_csv("../supplementary/Airline-Reviews-Table.csv")
airport_review = airport_review[airport_review.AIRLINE_CODE.isin(airport_set_df)]

airport_default_score = airport_review.groupby("AIRLINE_CODE")[[
    'ratingValue',
    'seatComfort', 
    'foodBeverage', 
    'cabinService', 
    'inflightEntertainment', 
    'wifiConnectivity', 
    'groundService', 
    'valueForMoney',
    'sentiment'
]].mean().reset_index()

airport_review_count = airport_review.groupby("AIRLINE_CODE").size().reset_index().rename(columns = {0:"reviewCount"})
airport_review_merged = pd.merge(airport_review_count, airport_default_score, how = "inner", on = "AIRLINE_CODE")
airport_review_merged = pd.merge(airport_review_merged, airline_names, how = "inner", left_on = "AIRLINE_CODE", right_on = "OP_UNIQUE_CARRIER")
airport_review_merged.head()
C:\Users\wongh\AppData\Local\Temp\ipykernel_7400\2395432350.py:2: DtypeWarning:

Columns (14) have mixed types. Specify dtype option on import or set low_memory=False.

Out[7]:
AIRLINE_CODE reviewCount ratingValue seatComfort foodBeverage cabinService inflightEntertainment wifiConnectivity groundService valueForMoney sentiment index OP_UNIQUE_CARRIER count CARRIER CARRIERNAME matched_name urls
0 AA 5000 0.214920 1.960084 1.860507 2.159644 1.973280 1.692893 1.638175 1.591600 -0.785200 1 AA 41637 AA American Airlines Inc. American Airlines https://www.airlinequality.com/airline-reviews...
1 AS 794 0.376952 2.656944 2.542751 2.850350 2.487298 2.416867 2.352863 2.287154 -0.400504 2 AS 9965 AS Alaska Airlines Inc. Alaska Airlines https://www.airlinequality.com/airline-reviews...
2 B6 1422 0.279817 2.480945 2.373869 2.474046 2.544898 2.353963 1.906773 1.931083 -0.637131 3 B6 12805 B6 JetBlue Airways Jetblue Airways https://www.airlinequality.com/airline-reviews...
3 DL 2667 0.357593 2.567887 2.569762 2.851837 2.887197 2.320190 2.324231 2.224681 -0.432321 4 DL 43959 DL Delta Air Lines Inc. Delta Air Lines https://www.airlinequality.com/airline-reviews...
4 F9 3311 0.158774 1.443140 1.258427 1.772019 1.050761 1.061650 1.361199 1.380248 -0.887043 6 F9 6270 F9 Frontier Airlines Inc. Frontier Airlines https://www.airlinequality.com/airline-reviews...
In [8]:
target_airport_df = pd.merge(
    target_df, 
    airport_review_merged,
    how = "inner", 
    left_on = "OP_UNIQUE_CARRIER", 
    right_on = "AIRLINE_CODE",
    suffixes = ["", "_origin"]
)
target_airport_df = pd.merge(
    target_airport_df,
    airline_names,
    how = "inner",
    on = "OP_UNIQUE_CARRIER",
)
target_airport_df.rename(columns = {
    'reviewCount':'Review Count', 
    'ratingValue':'Rating Value', 
    'seatComfort': "Seat Comfort", 
    'foodBeverage': "Food & Beverage", 
    'cabinService': "Cabin Service", 
    'inflightEntertainment': "Inflight Entertainment", 
    'wifiConnectivity': "Wifi Connectivity", 
    'groundService': "Ground Service", 
    'valueForMoney': "Value for Money",
    'sentiment': "Sentiment"
}, inplace = True)
target_airport_df.drop(columns = ['OP_UNIQUE_CARRIER'], inplace = True)
target_airport_df['CANCELLATION_CODE'] = target_airport_df['CANCELLATION_CODE'].fillna("Not")
target_airport_df.head()
Out[8]:
FL_DATE DEP_DELAY ARR_DELAY CANCELLED CANCELLATION_CODE CARRIER_DELAY WEATHER_DELAY NAS_DELAY SECURITY_DELAY LATE_AIRCRAFT_DELAY ... CARRIER_x CARRIERNAME_x matched_name_x urls_x index_y count_y CARRIER_y CARRIERNAME_y matched_name_y urls_y
0 2014-07-01 -5.0 9.0 0.0 Not NaN NaN NaN NaN NaN ... AA American Airlines Inc. American Airlines https://www.airlinequality.com/airline-reviews... 1 41637 AA American Airlines Inc. American Airlines https://www.airlinequality.com/airline-reviews...
1 2014-07-01 -4.0 0.0 0.0 Not NaN NaN NaN NaN NaN ... AA American Airlines Inc. American Airlines https://www.airlinequality.com/airline-reviews... 1 41637 AA American Airlines Inc. American Airlines https://www.airlinequality.com/airline-reviews...
2 2014-07-01 -2.0 -9.0 0.0 Not NaN NaN NaN NaN NaN ... AA American Airlines Inc. American Airlines https://www.airlinequality.com/airline-reviews... 1 41637 AA American Airlines Inc. American Airlines https://www.airlinequality.com/airline-reviews...
3 2014-07-01 14.0 4.0 0.0 Not NaN NaN NaN NaN NaN ... AA American Airlines Inc. American Airlines https://www.airlinequality.com/airline-reviews... 1 41637 AA American Airlines Inc. American Airlines https://www.airlinequality.com/airline-reviews...
4 2014-07-01 1.0 2.0 0.0 Not NaN NaN NaN NaN NaN ... AA American Airlines Inc. American Airlines https://www.airlinequality.com/airline-reviews... 1 41637 AA American Airlines Inc. American Airlines https://www.airlinequality.com/airline-reviews...

5 rows × 34 columns

Visualization¶

Airport Review¶

First of all, let's have a look at the Review Data we collected

In [9]:
airport_review_merged.head()
Out[9]:
AIRLINE_CODE reviewCount ratingValue seatComfort foodBeverage cabinService inflightEntertainment wifiConnectivity groundService valueForMoney sentiment index OP_UNIQUE_CARRIER count CARRIER CARRIERNAME matched_name urls
0 AA 5000 0.214920 1.960084 1.860507 2.159644 1.973280 1.692893 1.638175 1.591600 -0.785200 1 AA 41637 AA American Airlines Inc. American Airlines https://www.airlinequality.com/airline-reviews...
1 AS 794 0.376952 2.656944 2.542751 2.850350 2.487298 2.416867 2.352863 2.287154 -0.400504 2 AS 9965 AS Alaska Airlines Inc. Alaska Airlines https://www.airlinequality.com/airline-reviews...
2 B6 1422 0.279817 2.480945 2.373869 2.474046 2.544898 2.353963 1.906773 1.931083 -0.637131 3 B6 12805 B6 JetBlue Airways Jetblue Airways https://www.airlinequality.com/airline-reviews...
3 DL 2667 0.357593 2.567887 2.569762 2.851837 2.887197 2.320190 2.324231 2.224681 -0.432321 4 DL 43959 DL Delta Air Lines Inc. Delta Air Lines https://www.airlinequality.com/airline-reviews...
4 F9 3311 0.158774 1.443140 1.258427 1.772019 1.050761 1.061650 1.361199 1.380248 -0.887043 6 F9 6270 F9 Frontier Airlines Inc. Frontier Airlines https://www.airlinequality.com/airline-reviews...
In [10]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

corr_matrix = airport_review_merged.loc[:, 'ratingValue':'sentiment'].corr()
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', mask = mask, vmin = -1, vmax = 1)
plt.title('Airline Review Score Correlation')
plt.savefig("Airline_Review_Correlation")
No description has been provided for this image

Summary

The majority of the score given by users are very correlated,
quite a bit better than airport views but still.

Airport Ranking¶

In [13]:
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots 
In [14]:
fig = go.Figure()

fig = px.scatter(
    data_frame = airport_review_merged,
    x='sentiment', 
    y='ratingValue',
    color='ratingValue',
    size = 'reviewCount',
    color_continuous_scale = 'viridis',
    text='CARRIERNAME',
    log_y = True
)
fig.update_traces(textposition='top center')
fig.for_each_annotation(
    lambda a: a.update(text= "Medium Airport" if a.text == "Large=False" else "Large Airport")
)
fig.update_layout(
    height=600, 
    width=1000, 
    title_text="Airport Sentiment Visualization", 
    showlegend=False
)
fig.show()
In [13]:
airport_review_merged
Out[13]:
AIRLINE_CODE reviewCount ratingValue seatComfort foodBeverage cabinService inflightEntertainment wifiConnectivity groundService valueForMoney sentiment index OP_UNIQUE_CARRIER count CARRIER CARRIERNAME matched_name urls
0 AA 5000 0.214920 1.960084 1.860507 2.159644 1.973280 1.692893 1.638175 1.591600 -0.785200 1 AA 41637 AA American Airlines Inc. American Airlines https://www.airlinequality.com/airline-reviews...
1 AS 794 0.376952 2.656944 2.542751 2.850350 2.487298 2.416867 2.352863 2.287154 -0.400504 2 AS 9965 AS Alaska Airlines Inc. Alaska Airlines https://www.airlinequality.com/airline-reviews...
2 B6 1422 0.279817 2.480945 2.373869 2.474046 2.544898 2.353963 1.906773 1.931083 -0.637131 3 B6 12805 B6 JetBlue Airways Jetblue Airways https://www.airlinequality.com/airline-reviews...
3 DL 2667 0.357593 2.567887 2.569762 2.851837 2.887197 2.320190 2.324231 2.224681 -0.432321 4 DL 43959 DL Delta Air Lines Inc. Delta Air Lines https://www.airlinequality.com/airline-reviews...
4 F9 3311 0.158774 1.443140 1.258427 1.772019 1.050761 1.061650 1.361199 1.380248 -0.887043 6 F9 6270 F9 Frontier Airlines Inc. Frontier Airlines https://www.airlinequality.com/airline-reviews...
5 G4 1575 0.228635 1.798883 1.643929 2.144672 1.198675 1.237089 1.711067 1.782222 -0.709206 7 G4 3477 G4 Allegiant Air Allegiant Air https://www.airlinequality.com/airline-reviews...
6 NK 4424 0.205922 1.551877 1.351816 1.938198 1.100457 1.119522 1.555269 1.577662 -0.767631 10 NK 8757 NK Spirit Air Lines Spirit Airlines https://www.airlinequality.com/airline-reviews...
7 OO 11 0.672727 3.272727 1.833333 3.545455 0.750000 NaN 4.142857 3.090909 0.272727 12 OO 34846 OO SkyWest Airlines Inc. SkyWest Airlines https://www.airlinequality.com/airline-reviews...
8 UA 4749 0.278227 2.134851 2.015647 2.399780 2.118077 1.797923 1.925506 1.883765 -0.631501 14 UA 28030 UA United Air Lines Inc. United Airlines https://www.airlinequality.com/airline-reviews...
9 WN 1498 0.363818 2.465083 2.495513 2.859307 2.282158 2.291667 2.362932 2.362725 -0.423231 15 WN 63426 WN Southwest Airlines Co. Southwest Airlines https://www.airlinequality.com/airline-reviews...

Review Performance per Airport¶

In [15]:
df_columns = [
    'reviewCount',
    'seatComfort', 
    'foodBeverage', 
    'cabinService', 
    'inflightEntertainment', 
    'wifiConnectivity', 
    'groundService', 
    'valueForMoney',
    
]
df_better_names = [
    "Review Count",
    "Seat Comfort", 
    "Food & Beverage", 
    "Cabin Service", 
    "Inflight Entertainment", 
    "Wifi Connectivity", 
    "Ground Service", 
    "Value for Money",
    
]
fig = make_subplots(rows=4, cols=2, subplot_titles=df_better_names)
col = [1, 2]*4
row = [1, 1, 2, 2, 3, 3, 4, 4]
for c, r, column, better_column in zip(col, row, df_columns, df_better_names):
    airport_review_merged.sort_values(column, ascending = False, inplace = True)
    trace = go.Bar(
        x=airport_review_merged['CARRIERNAME'], 
        y=airport_review_merged[column],
        marker=dict(
            color = airport_review_merged[column],
            colorscale='viridis'
        )
    )
    fig.add_trace(
        trace, 
        row=r, 
        col=c
    )
fig.update_layout(
    height=1000, 
    title_text="Individual Scoring Attributes of Airline",
    showlegend=False
)
fig.show()

Delay vs Ranking¶

In [16]:
target_airport_df.columns
Out[16]:
Index(['FL_DATE', 'DEP_DELAY', 'ARR_DELAY', 'CANCELLED', 'CANCELLATION_CODE',
       'CARRIER_DELAY', 'WEATHER_DELAY', 'NAS_DELAY', 'SECURITY_DELAY',
       'LATE_AIRCRAFT_DELAY', 'AIRLINE_CODE', 'Review Count', 'Rating Value',
       'Seat Comfort', 'Food & Beverage', 'Cabin Service',
       'Inflight Entertainment', 'Wifi Connectivity', 'Ground Service',
       'Value for Money', 'Sentiment', 'index_x', 'OP_UNIQUE_CARRIER_origin',
       'count_x', 'CARRIER_x', 'CARRIERNAME_x', 'matched_name_x', 'urls_x',
       'index_y', 'count_y', 'CARRIER_y', 'CARRIERNAME_y', 'matched_name_y',
       'urls_y'],
      dtype='object')
In [17]:
train_columns = [
    'Rating Value',
    'Seat Comfort',
    'Food & Beverage',
    'Cabin Service',
    'Inflight Entertainment',
    'Wifi Connectivity',
    'Ground Service',
    'Value for Money',
    'Sentiment'
]
label_columns = [
    "DEP_DELAY",
    "ARR_DELAY",
    "CARRIER_DELAY",
    "WEATHER_DELAY",
    "NAS_DELAY",
    "SECURITY_DELAY",
    "LATE_AIRCRAFT_DELAY",
]

Delay Data Size >0

In [19]:
cal_df = target_airport_df[['CARRIERNAME_x'] + label_columns]
delay = cal_df.groupby('CARRIERNAME_x')[label_columns].apply(lambda x: (x > 0).mean()*100).reset_index().sort_values("ARR_DELAY", ascending = True)
big_delay = cal_df.groupby('CARRIERNAME_x')[label_columns].apply(lambda x: (x > 60).mean()*100).reset_index().sort_values("ARR_DELAY", ascending = True)
In [21]:
import pandas as pd
import matplotlib.pyplot as plt

df1 = delay
df2 = big_delay

fig, ax = plt.subplots(figsize=(12, 12))

bar_width = 0.35
index = range(len(df1))
bars3 = ax.barh([i for i in index], [100 for _ in range(len(df2['ARR_DELAY']))], bar_width, label='All Flight', color='royalblue')
bars1 = ax.barh(index, df1['ARR_DELAY'], bar_width, label='Arrival Delay > 0', color='pink')
bars2 = ax.barh([i for i in index], df2['ARR_DELAY'], bar_width, label='Arrival Delay > 60', color='red')


# Adding labels, title, and legend
ax.set_xlabel('Percentage (%)')
ax.set_ylabel('Airline')
ax.set_title('(%) of Delay Flights by Airline')
ax.set_yticks([i  for i in index])
ax.set_yticklabels(df1['CARRIERNAME_x'], rotation=0)
ax.legend()

plt.savefig("Delay_by_Airline")
No description has been provided for this image
In [22]:
target_airport_df.columns
Out[22]:
Index(['FL_DATE', 'DEP_DELAY', 'ARR_DELAY', 'CANCELLED', 'CANCELLATION_CODE',
       'CARRIER_DELAY', 'WEATHER_DELAY', 'NAS_DELAY', 'SECURITY_DELAY',
       'LATE_AIRCRAFT_DELAY', 'AIRLINE_CODE', 'Review Count', 'Rating Value',
       'Seat Comfort', 'Food & Beverage', 'Cabin Service',
       'Inflight Entertainment', 'Wifi Connectivity', 'Ground Service',
       'Value for Money', 'Sentiment', 'index_x', 'OP_UNIQUE_CARRIER_origin',
       'count_x', 'CARRIER_x', 'CARRIERNAME_x', 'matched_name_x', 'urls_x',
       'index_y', 'count_y', 'CARRIER_y', 'CARRIERNAME_y', 'matched_name_y',
       'urls_y'],
      dtype='object')
In [23]:
target_airport_df = target_airport_df[(target_airport_df['FL_DATE'].dt.month >= 5) & (target_airport_df['FL_DATE'].dt.month <= 8)]
target_airport_df.nunique()
Out[23]:
FL_DATE                     1230
DEP_DELAY                   1509
ARR_DELAY                   1559
CANCELLED                      2
CANCELLATION_CODE              5
CARRIER_DELAY               1314
WEATHER_DELAY                704
NAS_DELAY                    653
SECURITY_DELAY               198
LATE_AIRCRAFT_DELAY         1057
AIRLINE_CODE                  10
Review Count                  10
Rating Value                  10
Seat Comfort                  10
Food & Beverage               10
Cabin Service                 10
Inflight Entertainment        10
Wifi Connectivity              9
Ground Service                10
Value for Money               10
Sentiment                     10
index_x                       10
OP_UNIQUE_CARRIER_origin      10
count_x                       10
CARRIER_x                     10
CARRIERNAME_x                 10
matched_name_x                10
urls_x                        10
index_y                       10
count_y                       10
CARRIER_y                     10
CARRIERNAME_y                 10
matched_name_y                10
urls_y                        10
dtype: int64
In [24]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

corr_matrix = target_airport_df.loc[:, train_columns + label_columns].corr()

plt.figure(figsize=(10, 5))
sns.heatmap(corr_matrix.loc[train_columns, label_columns], annot=True, cmap='coolwarm')
plt.title('Airline Review Score Correlation with Delays')
plt.savefig("Airline Review Correlation with Delay")
No description has been provided for this image
In [32]:
import plotly.express as px

fig = px.box(
    target_airport_df.sample(100000, replace = False).sort_values(["ARR_DELAY"]), 
    x="CARRIERNAME_x", 
    y="ARR_DELAY",
    log_y = True,
    color_discrete_map = {True: 'red', False: 'blue'}
)
fig.update_layout(
    height=400, 
    title_text=f"Airline vs Arrival Delay",
    showlegend=True
)
fig.show()

Departure Delay Airport Route¶

In [ ]:
target_airport_df
In [ ]:
target_airport_df['Route'] = target_airport_df.apply(lambda row: row['ORIGIN_CITY_NAME'] + " -> " + row['DEST_CITY_NAME'], axis = 1)
target_airport_df.head()
In [ ]:
target_airport_df_temp = target_airport_df.groupby('Route').size().reset_index().rename(columns = {0: 'DEP_DELAY'})
target_airport_df_temp.sort_values("DEP_DELAY", ascending = False, inplace = True)
target_airport_df_temp = target_airport_df_temp.iloc[:10, :]
In [ ]:
import plotly.express as px
import numpy as np

np.random.seed(1)


top10_busy_route = pd.merge(
    target_airport_df, 
    target_airport_df_temp, 
    how = "inner", 
    on = 'Route', 
    suffixes = ["", "_mean"]
).sample(100000, replace = False)
fig = px.box(
    top10_busy_route.sort_values("DEP_DELAY_mean", ascending = False), 
    x="Route", 
    y="DEP_DELAY",
    log_y = True,
    color_discrete_map = {True: 'red', False: 'blue'}
)
fig.update_layout(
    height=1000, 
    title_text=f"Aiport Departure Delay",
    showlegend=True
)
fig.show()

Cancelled¶

In [33]:
train_columns = [
    'Rating Value',
    'Seat Comfort',
    'Food & Beverage',
    'Cabin Service',
    'Inflight Entertainment',
    'Wifi Connectivity',
    'Ground Service',
    'Value for Money',
    'Sentiment'
]
label_columns = [
    "Carrier",
    "Weather",
    "NAS",
    "Security",
    "Not"
]
In [34]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

temp = pd.concat([
    target_airport_df, 
    pd.get_dummies(target_airport_df['CANCELLATION_CODE'], prefix = "Cancellation")
], axis = 1)

temp.rename(columns = {
    "Cancellation_A": "Carrier",
    "Cancellation_B": "Weather",
    "Cancellation_C": "NAS",
    "Cancellation_D": "Security",
    "Cancellation_Not": "Not"
}, inplace = True)

corr_matrix = temp.loc[:, train_columns + label_columns].corr()

plt.figure(figsize=(10, 5))
sns.heatmap(corr_matrix.loc[train_columns, label_columns], annot=True, cmap='coolwarm')
plt.title('Airport Review Score Correlation with Cancellation Reason')
plt.show()
No description has been provided for this image
In [42]:
import pandas as pd
import plotly.express as px

hist_df = target_airport_df.groupby(["CARRIERNAME_x", "CANCELLATION_CODE"]).size().reset_index().rename(columns = {0: "Count"})
hist_df.replace({"A": "Carrier", "B":"Weather", "C":"NAS", "D":"Security"}, inplace = True)
count_df = target_airport_df.groupby(["CARRIERNAME_x"]).size().reset_index().rename(columns = {0: "All_Count"})
hist_df = pd.merge(hist_df, count_df, on = "CARRIERNAME_x")
hist_df['Percent'] = hist_df.apply(lambda row: row['Count']/row['All_Count']*100, axis = 1)
hist_df = hist_df[hist_df['CANCELLATION_CODE'] != "Not"]
hist_df.sort_values(["CARRIERNAME_x", "CANCELLATION_CODE"], inplace = True)
fig = px.histogram(hist_df, x='CARRIERNAME_x', y='Percent', color='CANCELLATION_CODE' ,title='Airline Cancellation Code')
fig.show()
In [37]:
corr_matrix = temp.loc[:, train_columns + label_columns].corr()

plt.figure(figsize=(10, 5))
sns.heatmap(corr_matrix.loc[train_columns, label_columns], annot=True, cmap='coolwarm')
plt.title('Airline Review Score Correlation with Cancellation Reason')
plt.savefig("Airline Review vs Cancellation")
No description has been provided for this image